/*
program to mask the word structure information
*/

capture program drop mask_it
/*
mask_it <word>

Adds the string variable <word>_masked to the data in memory. For every
distinct value of the string variable <word> it holds a "mask" of the same
length:
  - every character is replaced by a character drawn with runiformint() from
    the distinct non-digit characters that occur anywhere in <word>
    (a draw may coincide with the original character);
  - characters matching [0-9] are kept as they are;
  - one-character words are kept as they are;
  - all rows with the same <word> receive the same mask.

The caller's observations, their order and all other variables are preserved.
The program draws from the random-number generator, so the caller controls
reproducibility via set seed.

Working variables created after the collapse (identifier, charid, char,
char_shuffler, char_shuffled, random, char1, char2, ...) live only on the
reduced data set and never reach the caller's data.
*/
program mask_it
	args word
	tempfile TEMP1 TEMP2
	tempvar MATCH LENGTH ORDER DUP

	/* remember the caller's row order and park the complete data set */
	gen long `ORDER'=_n
	save `TEMP1', replace

	/* reduce to one row per distinct word (collapse leaves the data sorted by word) */
	gen `MATCH'=1
	collapse (sum) `MATCH', by(`word') fast
	drop `MATCH'
	gen long identifier=_n

	/* blow up to one row per character position of every distinct word */
	gen `LENGTH'=ustrlen(`word')
	expand `LENGTH'
	sort identifier
	bysort identifier: gen charid=_n
	gen char=usubstr(`word',charid,1)

	/* substitution pool: the distinct non-digit characters, in random order,
	   numbered 1..char_sum */
	preserve
	gen `MATCH'=1
	collapse (sum) `MATCH', by(char) fast
	drop if ustrregexm(char,"[0-9]")
	gen random=runiform()
	sort random
	gen char_shuffler=_n
	keep char char_shuffler
	rename char char_shuffled
	local char_sum=_N
	save `TEMP2', replace
	restore

	/* draw a pool position for every character and look up its replacement.
	   The pool is unique on char_shuffler, so this is a many-to-one lookup;
	   pool entries that were never drawn are not added to the data. */
	gen char_shuffler=runiformint(1,`char_sum')
	merge m:1 char_shuffler using `TEMP2', keep(master match) nogenerate
	sort identifier charid
	replace char_shuffled=char if ustrregexm(char,"[0-9]")

	/* glue the characters back together from the last position to the first:
	   char<c> on the row with charid==c holds the masked suffix starting at c.
	   This relies on the rows being sorted by identifier and charid, so that
	   [_n+1] is the next character of the same word (or an empty string past
	   the end of the word). */
	quietly summarize `LENGTH', meanonly
	local to=r(max)
	forvalues c=`to'(-1)1 {
		if `c'==`to' {
			gen str char`c'=char_shuffled if charid==`c'
		}
		else {
			local plusone=`c'+1
			gen char`c'=char_shuffled+char`plusone'[_n+1] if charid==`c'
		}
	}

	/* the row of the first character carries the complete mask */
	keep if charid==1
	keep `word' char1

	/* one character words are not masked */
	replace char1=`word' if ustrlen(char1)==1

	/* test for duplicated masks: every repeat of a mask is reversed.
	   NOTE(review): this does not guarantee distinct masks - a mask shared by
	   three or more words leaves the reversed copies identical, palindromic
	   masks are unchanged by the reversal, and a reversed mask can coincide
	   with another word's mask. Left as is because changing it would change
	   the generated output. */
	bysort char1: gen `DUP'=_n
	replace char1=ustrreverse(char1) if `DUP'>1
	drop `DUP'

	/* attach the mask to every occurrence of the word and restore the order */
	merge 1:m `word' using `TEMP1', nogenerate
	sort `ORDER'

	rename char1 `word'_masked
	drop `ORDER'
end
*********************************************************************
/* start preparation of files for each of N groups */
local dir `c(pwd)'
local group `1'

/*
Remark: word tokenization is based on the paralleltext.info procedure,
except for:

khm-x-bible-2011
khm-x-bible-standard2005
khm-x-bible-newworld
cmn-x-bible-cuvmp-simplified
cmn-x-bible-csb-simplified
cmn-x-bible-newsimplified
ksw-x-bible
mya-x-bible-common
mya-x-bible-1835
mya-x-bible-newworld
ang-x-bible

Reason: for those translations, the words are not separated correctly.
They are split with ustrwordcount()/ustrword() and the locale taken from the
ISO column of the index instead of wordcount()/word().
*/
local unicode_split khm-x-bible-newworld cmn-x-bible-cuvmp-simplified cmn-x-bible-csb-simplified cmn-x-bible-newsimplified mya-x-bible-newworld khm-x-bible-2011 khm-x-bible-standard2005 ang-x-bible ksw-x-bible mya-x-bible-common mya-x-bible-1835

use bible_index_`group', clear

local till=_N
local counter=_N

/* one pass per row of the index file of this group */
forvalues i=1/`till' {
	/* the index has to be reloaded because the loop body replaces the data in memory */
	use bible_index_`group', clear
	gsort -trans
	local translation=translation[`i']
	local location=ISO[`i']
	noisily di "`translation' (left: `counter') "
	qui {
	local counter=`counter'-1
	use "`dir'\corpus\\`translation'.dta" , clear
	drop if sentence==""

	/* randomize verse order (fixed seed, re-set for every translation) */
	set seed 72165
	gen random=runiform()
	sort random
	drop random

	/* get bible books: the first two characters of vid, read as a number */
	gen book=real(usubstr(vid,1,2))

	/* create macro containing each available book */
	preserve
	gen match=1
	collapse (sum) match, by(book)
	local nbooks=_N
	local books
	forvalues b=1/`nbooks' {
		local add=book[`b']
		local books `"`books' `add'"'
	}
	restore

	/* one word per line */
	local use_unicode : list posof "`translation'" in unicode_split
	gen sentid=_n
	if `use_unicode' {
		/* treat dash-separated words as one word: hide the hyphen from the
		   tokenizer and put it back afterwards */
		replace sentence=usubinstr(sentence,"-","HYPHEN",.)
		gen sentencelength=ustrwordcount(sentence, "`location'")
	}
	else {
		gen sentencelength=wordcount(sentence)
	}
	expand sentencelength
	sort sentid
	bysort sentid: gen wordid=_n
	if `use_unicode' {
		gen word=ustrword(sentence,wordid, "`location'")
		replace word=usubinstr(word,"HYPHEN","-",.)
	}
	else {
		gen word=word(sentence,wordid)
	}

	compress
	keep word sentid book
	drop if word==""|word==" "

	save grouped_data_`group', replace

	/* per bible book */
	foreach book of local books {

		use if book==`book' using grouped_data_`group', clear

		/* mask word structure: adds word_masked next to word */
		mask_it word
		rename word word_original

		/* write one text file per book for each of three scenarios:
		   original, with destroyed word structure (masked words) and with
		   destroyed word order (words shuffled within each sentence) */
		foreach type in original structure order {
			preserve
			if "`type'"=="order" {
				gen random=runiform()
				sort sentid random
			}
			if "`type'"=="structure" {
				local outvar word_masked
			}
			else {
				local outvar word_original
			}
			keep `outvar'

			outsheet `outvar' using "text_`group'.txt", nonames noquote replace

			/* join the one-word-per-line file into a single space-separated line */
			filefilter text_`group'.txt `"`dir'\length\\`translation'_`book'_`type'.txt"', from(\r\n) to(`" "') replace
			restore
		}
	}

	}
}
		/* cleaning up: best-effort removal of the work files of this group.
		   capture suppresses the error for files that do not exist.
		   Note that the list includes bible_index_<group>.dta, the index file
		   read at the top of this script. */
		foreach file in grouped_data_`group'.dta bible_index_`group'.dta raw_`group'.raw raw1_`group'.raw raw2_`group'.raw char_shuffler_`group'.dta zip_`group'.zip text_`group'.txt text2_`group'.txt distorted_`group'.txt group_`group'.dta id_`group'.dta temp_`group'.dta {
			capture erase `file'
		}

/* generate finished file: a data set with a single observation (v==1),
   saved as finished_<group>.dta - presumably a completion flag for whatever
   launched this group; verify against the calling script */
clear
set obs 1
generate v = 1
save finished_`group', replace

/* empty the memory and leave Stata */
exit, clear STATA



